getwd()
## [1] "C:/Users/laura.spencer/Work/red-king_RNASeq-2022/notebooks"
source("../references/biostats.R")
list.of.packages <- c("DESeq2", "RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly", "cowplot", "clipr", "janitor", "ggpubr", "forcats") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
# Install DESeq2
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
#BiocManager::install("DESeq2")
require(DESeq2)
load(file = "../results/gene-counts") #object = counts
load(file="../data/sample.info") #object = sample.info
load(file = "../results/gene-counts-trans") #object = counts.t
NOTE: As of January 19th, 2022 I will not remove any low-count genes.
keep <- colSums(counts.t) >= 10
counts.ts <- counts.t[,keep]
print(paste("# genes remaining after pre-filtering:", ncol(counts.ts)))
## [1] "# genes remaining after pre-filtering: 22460"
print(paste("# of genes dropped:", ncol(counts.t) - ncol(counts.ts), sep=" "))
## [1] "# of genes dropped: 5827"
# What's the average no. of genes per sample?
data.frame(rowSums(counts.t != 0)) %>%
dplyr::rename(count.total = 1) %>%
rownames_to_column(var="sample") %>%
summarise(mean=mean(count.total), sd=sd(count.total), se=sd/sqrt(length(sample)),
min=min(count.total), max=max(count.total))
## mean sd se min max
## 1 20549.33 362.5444 55.94181 19547 21210
# No. of reads per sample?
data.frame(rowSums(counts.t)) %>%
dplyr::rename(read.total = 1) %>%
rownames_to_column(var="sample") %>% #summary()
summarise(mean=mean(read.total), sd=sd(read.total), se=sd/sqrt(length(sample))) #use this to average across all samples
## mean sd se
## 1 14227057 2171890 335129.9
ggplotly(
ggplot(data = data.frame(rowSums(counts.t != 0)) %>%
dplyr::rename(count.total = 1) %>%
rownames_to_column(var="sample")) +
geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total # genes by sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()))
# note: you'll need to press return in the console for all plots
#foa.plots(counts.ts)
# merge count data with sample key, reset row names as sample names, and arrange by infection, then temperature, then day
counts.tk <- merge(x=sample.info[,c("Sample", "Tank", "Treatment", "Treatment_Tank")], by.x="Sample", y=counts.ts, by.y="row.names") %>%
arrange(Treatment, Tank) %>% column_to_rownames(var="Sample") %>% droplevels()
head(counts.tk[1:24]) #check out results of merge/arrange
## Tank Treatment Treatment_Tank evm.TU.HiC_scaffold_97.1
## Tank_1_Crab_1 1 Ambient Ambient.1 11
## Tank_1_Crab_2 1 Ambient Ambient.1 12
## Tank_1_Crab_3 1 Ambient Ambient.1 16
## Tank_2_Crab_1 2 Ambient Ambient.2 8
## Tank_2_Crab_2 2 Ambient Ambient.2 21
## Tank_2_Crab_3 2 Ambient Ambient.2 37
## evm.TU.HiC_scaffold_86.1 evm.TU.HiC_scaffold_86.2
## Tank_1_Crab_1 3 1
## Tank_1_Crab_2 2 0
## Tank_1_Crab_3 4 1
## Tank_2_Crab_1 8 1
## Tank_2_Crab_2 0 3
## Tank_2_Crab_3 6 2
## evm.TU.HiC_scaffold_42.1 evm.TU.HiC_scaffold_22.1
## Tank_1_Crab_1 3794 2
## Tank_1_Crab_2 4720 4
## Tank_1_Crab_3 6307 8
## Tank_2_Crab_1 4243 4
## Tank_2_Crab_2 6527 1
## Tank_2_Crab_3 4750 0
## evm.TU.HiC_scaffold_22.2 evm.TU.HiC_scaffold_29.1
## Tank_1_Crab_1 53 545
## Tank_1_Crab_2 112 609
## Tank_1_Crab_3 146 727
## Tank_2_Crab_1 67 980
## Tank_2_Crab_2 48 876
## Tank_2_Crab_3 23 846
## evm.TU.HiC_scaffold_28.2 evm.TU.HiC_scaffold_50.1
## Tank_1_Crab_1 7 351
## Tank_1_Crab_2 5 330
## Tank_1_Crab_3 7 443
## Tank_2_Crab_1 25 424
## Tank_2_Crab_2 20 555
## Tank_2_Crab_3 28 314
## evm.TU.HiC_scaffold_50.2 evm.TU.HiC_scaffold_92.1
## Tank_1_Crab_1 721 155
## Tank_1_Crab_2 474 235
## Tank_1_Crab_3 375 339
## Tank_2_Crab_1 389 120
## Tank_2_Crab_2 768 46
## Tank_2_Crab_3 767 88
## evm.TU.HiC_scaffold_92.2 evm.TU.HiC_scaffold_92.3
## Tank_1_Crab_1 54 26
## Tank_1_Crab_2 56 53
## Tank_1_Crab_3 55 154
## Tank_2_Crab_1 74 41
## Tank_2_Crab_2 60 10
## Tank_2_Crab_3 73 20
## evm.TU.HiC_scaffold_92.4 evm.TU.HiC_scaffold_103.1
## Tank_1_Crab_1 5 6
## Tank_1_Crab_2 14 11
## Tank_1_Crab_3 29 16
## Tank_2_Crab_1 2 14
## Tank_2_Crab_2 5 16
## Tank_2_Crab_3 3 14
## evm.TU.HiC_scaffold_102.1 evm.TU.HiC_scaffold_29.2
## Tank_1_Crab_1 20 4142
## Tank_1_Crab_2 16 6019
## Tank_1_Crab_3 27 6202
## Tank_2_Crab_1 26 5295
## Tank_2_Crab_2 34 3604
## Tank_2_Crab_3 23 4752
## evm.TU.HiC_scaffold_29.3 evm.TU.HiC_scaffold_29.4
## Tank_1_Crab_1 53 11
## Tank_1_Crab_2 27 15
## Tank_1_Crab_3 28 6
## Tank_2_Crab_1 97 47
## Tank_2_Crab_2 53 14
## Tank_2_Crab_3 70 23
## evm.TU.HiC_scaffold_29.5 evm.TU.HiC_scaffold_78.1
## Tank_1_Crab_1 0 1483
## Tank_1_Crab_2 0 1224
## Tank_1_Crab_3 0 1075
## Tank_2_Crab_1 1 1904
## Tank_2_Crab_2 4 1270
## Tank_2_Crab_3 0 1899
#counts.tk %>% dplyr::select(starts_with("evm")) #this is code to get only the gene columns
NOTE: scale=“column” b/c range of counts is so huge, so counts have been scaled
pheatmap(data.matrix(counts.tk %>% dplyr::select(starts_with("evm"))), Rowv=NA, Colv=NA, na.rm = TRUE, xlab = NA,
show_colnames =FALSE, cluster_cols = FALSE, cluster_rows = TRUE,
scale="column", color=c("dodgerblue3", "goldenrod1"),
main = "RKC gene counts", annotation_row=counts.tk[,c("Treatment", "Tank")])
NOTE: It is absolutely critical that the columns of the count matrix and the rows of the column data (information about samples) are in the same order. DESeq2 will not make guesses as to which column of the count matrix belongs to which row of the column data, these must be provided to DESeq2 already in consistent order.
all(rownames(counts.tk) == counts.tk %>% dplyr::select(starts_with("evm")) %>% t() %>% colnames()) #check that rownames of untransformed matrix match column names of transformed matrix. Should print 'TRUE'
## [1] TRUE
dds.pH <- DESeqDataSetFromMatrix(countData = counts.tk[,grepl("evm", colnames(counts.tk))] %>% t(),
colData = counts.tk[,"Treatment", drop=FALSE] ,
design = ~ Treatment)
dds.tank <- DESeqDataSetFromMatrix(countData = counts.tk[,grepl("evm", colnames(counts.tk))] %>% t(),
colData = counts.tk[,"Tank", drop=FALSE] ,
design = ~ Tank)
dds.treat.tank <- DESeqDataSetFromMatrix(countData = counts.tk[,grepl("evm", colnames(counts.tk))] %>% t(),
colData = counts.tk[,"Treatment_Tank", drop=FALSE] ,
design = ~ Treatment_Tank)
blind=FALSE b/c we are interested in differences explained by experimental design, and may wish to use this transformed data in downstream analyses.vsd.pH <- varianceStabilizingTransformation(dds.pH, blind=FALSE)
vsd.tank <- varianceStabilizingTransformation(dds.tank, blind=FALSE)
vsd.treat.tank <- varianceStabilizingTransformation(dds.treat.tank, blind=FALSE)
NOTE: Hover over points to see the sample numbers
# PCA with points color coded by pH Treatment
ggplotly(
plotPCA(vsd.pH, intgroup="Treatment") +
ggtitle("PCA by Treatment (var-stabilizing transformed)") +
geom_point(size=3, aes(text=colnames(vsd.pH))) +
theme_minimal()+ stat_ellipse(), tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# PCA with points color coded by tank
ggplotly(
plotPCA(vsd.tank, intgroup="Tank") +
ggtitle("PCA by Tank (var-stabilizing transformed)") +
geom_point(size=3, aes(text=colnames(vsd.tank))) +
theme_minimal(), tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# PCA with points color coded by treatment + tank
ggplotly(
plotPCA(vsd.treat.tank, intgroup="Treatment_Tank") +
ggtitle("PCA by Treatment & Tank (var-stabilizing transformed)") +
geom_point(size=3, aes(text=colnames(vsd.treat.tank))) +
theme_minimal(), tooltip = "text")
## Warning: Ignoring unknown aesthetics: text
# Ambient pH (8.0) tanks: 1,2,7,10,11
# Moderate pH (7.8) tanks: 3,4,9,20
# Low pH (7.5) tanks: 5,13,15,16,18
PCA.data.pH <- plotPCA(vsd.pH, intgroup=c("Treatment"), returnData=TRUE)
save(PCA.data.pH, file="../results/PCA.data.pH")
# extract treatment info from VSD transformation
#vsd.df <- as.data.frame(colData(vsd)[,c("population", "pCO2.parent")])
# generate heatmap from untransformed counts
#pheatmap(counts(dds), cluster_rows=FALSE, show_rownames=FALSE,
# cluster_cols=T, annotation_col=vsd.df, scale = "row", main="QuantSeq, untransformed data (but scaled by rows")
# generate heatmap from VSD counts
#pheatmap(assay(vsd), cluster_rows=FALSE, show_rownames=FALSE,
# cluster_cols=T, annotation_col=vsd.df, main = "QuantSeq, VSD-transformed")
Another use of the transformed data is sample clustering. Here, we apply the dist function to the transpose of the transformed count matrix to get sample-to-sample distances.
A heatmap of this distance matrix gives us an overview over similarities and dissimilarities between samples. We have to provide a hierarchical clustering hc to the heatmap function based on the sample distances, or else the heatmap function would calculate a clustering based on the distances between the rows/columns of the distance matrix.
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")) )(255)
sampleDistss <- dist(t(assay(vsd.pH)))
sampleDistMatrixs <- as.matrix(sampleDistss)
# Here we only show pH treatment
rownames(sampleDistMatrixs) <- vsd.pH$Treatment #set row names
colnames(sampleDistMatrixs) <- NULL
pheatmap(sampleDistMatrixs,
clustering_distance_rows=sampleDistss,
clustering_distance_cols=sampleDistss,
col=rev(colors), fontsize = 6)
# Here we also show tank number
sampleDistss <- dist(t(assay(vsd.treat.tank)))
sampleDistMatrixs <- as.matrix(sampleDistss)
rownames(sampleDistMatrixs) <- vsd.treat.tank$Treatment_Tank #set row names
colnames(sampleDistMatrixs) <- NULL
pheatmap(sampleDistMatrixs,
clustering_distance_rows=sampleDistss,
clustering_distance_cols=sampleDistss,
col=rev(colors), fontsize = 6)
DESeq to assess differential expressiondds.DESeq.pH <- DESeq(dds.pH)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 86 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
#colData(dds.DESeq.pH)
print("Comparison: Ambient pH vs. Moderate pH")
## [1] "Comparison: Ambient pH vs. Moderate pH"
summary(res.pco2.AM <- results(dds.DESeq.pH, contrast=c("Treatment", "Ambient", "Moderate"), alpha=0.05))
##
## out of 22460 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 603, 2.7%
## LFC < 0 (down) : 227, 1%
## outliers [1] : 0, 0%
## low counts [2] : 7403, 33%
## (mean count < 13)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Moderate:", sum(res.pco2.AM$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Moderate: 830"
print("Comparison: Ambient pH vs. Low pH")
## [1] "Comparison: Ambient pH vs. Low pH"
summary(res.pco2.AL <- results(dds.DESeq.pH, contrast=c("Treatment", "Ambient", "Low"), alpha=0.05))
##
## out of 22460 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1067, 4.8%
## LFC < 0 (down) : 430, 1.9%
## outliers [1] : 0, 0%
## low counts [2] : 6967, 31%
## (mean count < 11)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Low:", sum(res.pco2.AL$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, Ambient vs. Low: 1497"
print("Comparison: Moderate pH vs. Low pH")
## [1] "Comparison: Moderate pH vs. Low pH"
summary(res.pco2.ML <- results(dds.DESeq.pH, contrast=c("Treatment", "Moderate", "Low"), alpha=0.05))
##
## out of 22460 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 12, 0.053%
## LFC < 0 (down) : 5, 0.022%
## outliers [1] : 0, 0%
## low counts [2] : 9580, 43%
## (mean count < 27)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, Moderate vs. Low:", sum(res.pco2.ML$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, Moderate vs. Low: 17"
#Subset the DESeq results for only those statistically sign. ones
diffex.AM <- subset(res.pco2.AM, padj < 0.05)
diffex.AL <- subset(res.pco2.AL, padj < 0.05)
diffex.ML <- subset(res.pco2.ML, padj < 0.05)
### Generate counts matrix With DEGs among any pH treatment
diffex.pH.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% unique(c(rownames(diffex.AM),rownames(diffex.AL),rownames(diffex.ML))))
# Create a dataframe to annotate heatmap with treatments
dds.pH.df <- sample.info[match(colnames(diffex.pH.counts), sample.info$Sample),c("Sample", "Treatment", "Tank")] %>%
remove_rownames() %>% column_to_rownames(var = "Sample")
# Make sure treatment order is correct
all(colnames(diffex.pH.counts) == rownames(dds.pH.df)) #double check that samples are still in same order
## [1] TRUE
# All DEGs among any pH treatment
pheatmap(diffex.pH.counts, cluster_rows=TRUE, cluster_cols=FALSE,
show_rownames=FALSE, na.rm=TRUE, annotation_colors = list(Treatment=c(Ambient="#2c7bb6", Moderate="#fdae61", Low="#d7191c")),
scale="row", main = "Differentially expressed genes (DEGs) among any pH treatment",
annotation_col=dds.pH.df[,"Treatment", drop=FALSE], fontsize = 8)
# # another way to do a heatmap
# heatmap(diffex.pH.counts, Colv = NA, scale="row", ColSideColors = brewer.pal(3, "Set1")[dds.pH.df$Treatment])
### ----- Generate pairwise comparison heatmaps
# Create count matrices containing DEGs for each pairwise treatment comparison, rows (genes) ordered by pvalue,
diffex.AM.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% rownames(diffex.AM)) %>% as.data.frame() %>%
dplyr::select((sample.info %>% arrange(Treatment) %>%
filter(Treatment=="Ambient" | Treatment== "Moderate"))$Sample) %>%
rownames_to_column(var = "gene") %>% arrange(match(gene, c(as.data.frame(diffex.AM) %>% arrange(padj) %>% rownames()))) %>%
column_to_rownames(var = "gene") %>% as.matrix()
diffex.AL.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% rownames(diffex.AL)) %>% as.data.frame() %>%
dplyr::select((sample.info %>% arrange(Treatment) %>%
filter(Treatment=="Ambient" | Treatment== "Low"))$Sample) %>%
rownames_to_column(var = "gene") %>% arrange(match(gene, c(as.data.frame(diffex.AL) %>% arrange(padj) %>% rownames()))) %>%
column_to_rownames(var = "gene") %>% as.matrix()
diffex.ML.counts <- subset(counts(dds.DESeq.pH), rownames(dds.DESeq.pH) %in% rownames(diffex.ML)) %>% as.data.frame() %>%
dplyr::select((sample.info %>% arrange(Treatment) %>%
filter(Treatment=="Moderate" | Treatment== "Low"))$Sample) %>%
rownames_to_column(var = "gene") %>%
arrange(match(gene, c(as.data.frame(diffex.ML) %>% arrange(padj) %>% rownames()))) %>%
column_to_rownames(var = "gene") %>% as.matrix()
pheatmap(diffex.AM.counts, cluster_cols = FALSE, cluster_rows=TRUE,
show_rownames=FALSE, na.rm=TRUE, scale="row",
main = "DEGs among pH 8.0 (ambient) & pH 7.8 (moderate), sorted by p-value",
fontsize = 8, cutree_rows = 2,
annotation_colors = list(Treatment=c(Ambient="#2c7bb6", Moderate="#fdae61")),
annotation_col=dds.pH.df[colnames(diffex.AM.counts),"Treatment", drop=FALSE])
pheatmap(diffex.AL.counts, cluster_cols = FALSE, cluster_rows=TRUE,
show_rownames=FALSE, na.rm=TRUE, scale="row",
main = "DEGs among pH 8.0 (ambient) & pH 7.5 (low), sorted by p-value",
fontsize = 8, cutree_rows = 2,
annotation_colors = list(Treatment=c(Ambient="#2c7bb6", Low="#d7191c")),
annotation_col=dds.pH.df[colnames(diffex.AL.counts),"Treatment", drop=FALSE])
pheatmap(diffex.ML.counts, cluster_cols = FALSE, cluster_rows=TRUE,
show_rownames=TRUE, na.rm=TRUE, scale="row",
main = "DEGs among pH 7.8 (moderate) & pH 7.5 (low), sorted by p-value",
fontsize = 8, cutree_rows = 2,
annotation_colors = list(Treatment=c(Moderate="#fdae61", Low="#d7191c")),
annotation_col=dds.pH.df[colnames(diffex.ML.counts),"Treatment", drop=FALSE])
Here I will do a quick enrichment analysis using the online tool DAVID. I copy the Uniprot SPIDs for the annotated DEGs and all annotated genes examined into the DAVID Functional Annotation Tools. I do so for each pairwise pH comparison.
# DEGs between Ambient & Moderate (includes 733 annotated genes, when outlier removed it's 344)
diffex.AM %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>%
left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>%
dplyr::select(SPID) %>% na.omit() %>% unlist() %>% as.vector() %>% write_clip()
# DEGs between Ambient & Low (includes 967 annotated genes, when outlier removed it's 679)
diffex.AL %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>%
left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>%
dplyr::select(SPID) %>% na.omit() %>% unlist() %>% as.vector() %>% write_clip()
# DEGs between Moderate & Low (includes just 2 annotated genes, 3 when outlier removed) <----- None of these were registered by DAVID
diffex.ML %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>%
left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>%
dplyr::select(SPID) %>% na.omit() %>% unlist() %>% as.vector() %>% write_clip()
# All genes (background, includes 7,238 annotated genes)
res.pco2.AM %>% as.data.frame() %>% rownames_to_column("gene") %>% dplyr::select(gene, padj) %>%
left_join(P.plat.blast.GO %>% dplyr::select(geneID, SPID, `Protein names`), by = c("gene"="geneID")) %>%
dplyr::select(SPID) %>% na.omit() %>% unlist() %>% as.vector() %>% write_clip()
Enriched biological processes (and associated gene lists) are saved to GitHub: – Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Mod.txt
– Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Low.txt
– Moderate pH vs. Low pH, NONE
Updated Enriched biological processes with outlier sample removed (Tank 7 Crab 4) – Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Mod_no-outlier-T7C4.txt
– Ambient pH vs. Moderate pH, results/DAVID-Enriched-BP_Amb-Low_no-outlier-T7C4.txt
– Moderate pH vs. Low pH, NONE
Visualize with REVIGO - read in DAVID enrichment results, extract GO terms and PValues and copy to clipboard to paste into REVIGO
# With all samples
read_delim(file="../results/DAVID-Enriched-BP_Amb-Mod.txt", delim = "\t") %>%
mutate(GO = str_extract(Term, "GO(.*?)~")) %>%
mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()
read_delim(file="../results/DAVID-Enriched-BP_Amb-Low.txt", delim = "\t") %>%
mutate(GO = str_extract(Term, "GO(.*?)~")) %>%
mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()
# With outlier sample (Tank 7 Crab 4) removed
# Between ambient and moderate pH
read_delim(file="../results/DAVID-Enriched-BP_Amb-Mod_no-outlier-T7C4.txt", delim = "\t") %>%
mutate(GO = str_extract(Term, "GO(.*?)~")) %>%
mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()
# Between ambient and low pH
read_delim(file="../results/DAVID-Enriched-BP_Amb-Low_no-outlier-T7C4.txt", delim = "\t") %>%
mutate(GO = str_extract(Term, "GO(.*?)~")) %>%
mutate(GO = gsub("~", "", GO)) %>% dplyr::select(GO, PValue) %>% na.omit() %>% write_clip()